Fit temperature models and predict growing season temperature

Author

Max Lindmark, Jan Ohlberger, Anna Gårdmark

Published

June 27, 2023

Load libraries

Code
# Packages required by this script ("tidylog" was listed twice in the
# original vector; it only needs to appear once)
pkgs <- c("here", "tidyverse", "tidylog", "RColorBrewer", "viridis", "sdmTMB",
          "sdmTMBextra", "patchwork", "RCurl")

# minpack.lm needed if using nlsLM()
# Install any packages not already present (compute the missing set once)
missing_pkgs <- setdiff(pkgs, rownames(installed.packages()))
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs, dependencies = TRUE)
}

# Attach all packages; invisible() suppresses lapply()'s list return value
invisible(lapply(pkgs, library, character.only = TRUE))
#> here() starts at /Users/maxlindmark/Dropbox/Max work/R/perch-growth
#> ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
#> ✔ dplyr     1.1.2     ✔ readr     2.1.4
#> ✔ forcats   1.0.0     ✔ stringr   1.5.0
#> ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
#> ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
#> ✔ purrr     1.0.1     
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ dplyr::filter() masks stats::filter()
#> ✖ dplyr::lag()    masks stats::lag()
#> ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
#> 
#> Attaching package: 'tidylog'
#> 
#> 
#> The following objects are masked from 'package:dplyr':
#> 
#>     add_count, add_tally, anti_join, count, distinct, distinct_all,
#>     distinct_at, distinct_if, filter, filter_all, filter_at, filter_if,
#>     full_join, group_by, group_by_all, group_by_at, group_by_if,
#>     inner_join, left_join, mutate, mutate_all, mutate_at, mutate_if,
#>     relocate, rename, rename_all, rename_at, rename_if, rename_with,
#>     right_join, sample_frac, sample_n, select, select_all, select_at,
#>     select_if, semi_join, slice, slice_head, slice_max, slice_min,
#>     slice_sample, slice_tail, summarise, summarise_all, summarise_at,
#>     summarise_if, summarize, summarize_all, summarize_at, summarize_if,
#>     tally, top_frac, top_n, transmute, transmute_all, transmute_at,
#>     transmute_if, ungroup
#> 
#> 
#> The following objects are masked from 'package:tidyr':
#> 
#>     drop_na, fill, gather, pivot_longer, pivot_wider, replace_na,
#>     spread, uncount
#> 
#> 
#> The following object is masked from 'package:stats':
#> 
#>     filter
#> 
#> 
#> Loading required package: viridisLite
#> 
#> 
#> Attaching package: 'sdmTMBextra'
#> 
#> 
#> The following objects are masked from 'package:sdmTMB':
#> 
#>     dharma_residuals, extract_mcmc
#> 
#> 
#> 
#> Attaching package: 'RCurl'
#> 
#> 
#> The following object is masked from 'package:tidyr':
#> 
#>     complete

# devtools::install_github("seananderson/ggsidekick") # not on CRAN 
# ggsidekick provides theme_sleek(), set as the global ggplot2 theme below
library(ggsidekick)
theme_set(theme_sleek())

# Set path:
# here::here() anchors all file paths at the project root
home <- here::here()

Load cache

Code
# qwraps2::lazyload_cache_dir(path = paste0(home, "/R/analyze-data/01-fit-temp-models-predict_cache/html"))

Read data

Code
# Daily temperature observations per area, with data-source information
d <- read_csv(paste0(home, "/output/temp_data_for_fitting.csv"))
#> Rows: 98616 Columns: 13
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> chr  (4): area, source, db, id
#> dbl  (8): year, temp, yday, month, day, VkDag, stn_nr, section_nr
#> date (1): date
#> 
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Factor versions of area, source and year for use as model terms
d <- d |> mutate(area = as.factor(area),
                 source_f = as.factor(source),
                 year_f = as.factor(year))
#> mutate: converted 'area' from character to factor (0 new NA)
#>         new variable 'source_f' (factor) with 3 unique values and 0% NA
#>         new variable 'year_f' (factor) with 83 unique values and 0% NA

# Drop VN, no logger data?
# Count data sources per area: all areas have 3 sources except VN (2)
d |> group_by(area) |> summarise(n_source = length(unique(source_f))) 
#> group_by: one grouping variable (area)
#> summarise: now 12 rows and 2 columns, ungrouped
#> # A tibble: 12 × 2
#>    area  n_source
#>    <fct>    <int>
#>  1 BS           3
#>  2 BT           3
#>  3 FB           3
#>  4 FM           3
#>  5 HO           3
#>  6 JM           3
#>  7 MU           3
#>  8 RA           3
#>  9 SI_EK        3
#> 10 SI_HA        3
#> 11 TH           3
#> 12 VN           2

# Drop data in SI_HA and BT before onset of warming?
# NOTE(review): d2 (pre-warming years removed) is created here, but the
# models below are fit to d, not d2 — confirm which data set is intended
d2 <- d |>
  mutate(discard = "N",
         discard = ifelse(area == "BT" & year <= 1980, "Y", discard),
         discard = ifelse(area == "SI_HA" & year <= 1972, "Y", discard)) |> 
  filter(discard == "N")
#> mutate: new variable 'discard' (character) with 2 unique values and 0% NA
#> filter: removed 1,146 rows (1%), 97,470 rows remaining

Fit models

Source as independent or interactive effect

Code
# library(brms)
# # Too slow...
# testm <- brm(temp ~ area*year_f + source_f + s(yday, by = area, bs = "cc"), 
#              data = d |> sample_n(5000),
#              family = student(),
#              chains = 2,
#              cores = 2,
#              knots = list(yday = c(0.5, 364.5))
#              )
# 
# summary(testm)
# 
# detach("package:brms", unload = TRUE)
# 
# library(QRM)
# 
# fit.st(data = d |> sample_n(500))
Code
# Main model: Student-t (df = 5) regression with area-by-year fixed effects,
# a global data-source effect, and an area-specific cyclic smooth over day
# of year; spatial/spatiotemporal random fields switched off
m <- sdmTMB(temp ~ area*year_f + source_f + s(yday, by = area, bs = "cc"), 
            data = d,
            family = student(df = 5),
            spatial = "off",
            spatiotemporal = "off",
            knots = list(yday = c(0.5, 364.5)), # cyclic boundary knots at the year edges
            control = sdmTMBcontrol(newton_loops = 1))

Try alternative degrees of freedom

Code
# Sensitivity runs: refit the same model as m with alternative Student-t
# degrees of freedom. The three fits below were copy-pasted in the original;
# a single helper keeps the specification in one place.
fit_temp_model <- function(df_t) {
  # Identical specification to m; only the t-distribution df varies
  sdmTMB(temp ~ area * year_f + source_f + s(yday, by = area, bs = "cc"),
         data = d,
         family = student(df = df_t),
         spatial = "off",
         spatiotemporal = "off",
         knots = list(yday = c(0.5, 364.5)),
         control = sdmTMBcontrol(newton_loops = 1))
}

# df = 20 (effectively gaussian)
m2 <- fit_temp_model(20)

# df = 9
m3 <- fit_temp_model(9)

# df = 4
m4 <- fit_temp_model(4)

# Plot all residuals
# MLE-MCMC residuals for m: one short chain (200 warmup + 1 sampling
# iteration) with fixed effects held at their MLEs; slow but appropriate
# for the Student-t observation model
mcmc_res <- residuals(m, type = "mle-mcmc",
                      mcmc_samples = sdmTMBextra::predict_mle_mcmc(m,
                                                                   mcmc_iter = 201,
                                                                   mcmc_warmup = 200))
#> 
#> SAMPLING FOR MODEL 'tmb_generic' NOW (CHAIN 1).
#> Chain 1: 
#> Chain 1: Gradient evaluation took 0.048341 seconds
#> Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 483.41 seconds.
#> Chain 1: Adjust your expectations accordingly!
#> Chain 1: 
#> Chain 1: 
#> Chain 1: Iteration:   1 / 201 [  0%]  (Warmup)
#> Chain 1: Iteration:  20 / 201 [  9%]  (Warmup)
#> Chain 1: Iteration:  40 / 201 [ 19%]  (Warmup)
#> Chain 1: Iteration:  60 / 201 [ 29%]  (Warmup)
#> Chain 1: Iteration:  80 / 201 [ 39%]  (Warmup)
#> Chain 1: Iteration: 100 / 201 [ 49%]  (Warmup)
#> Chain 1: Iteration: 120 / 201 [ 59%]  (Warmup)
#> Chain 1: Iteration: 140 / 201 [ 69%]  (Warmup)
#> Chain 1: Iteration: 160 / 201 [ 79%]  (Warmup)
#> Chain 1: Iteration: 180 / 201 [ 89%]  (Warmup)
#> Chain 1: Iteration: 200 / 201 [ 99%]  (Warmup)
#> Chain 1: Iteration: 201 / 201 [100%]  (Sampling)
#> Chain 1: 
#> Chain 1:  Elapsed Time: 485.167 seconds (Warm-up)
#> Chain 1:                0.422298 seconds (Sampling)
#> Chain 1:                485.589 seconds (Total)
#> Chain 1:

# Same MLE-MCMC residuals for the df = 20 fit
mcmc_res20 <- residuals(m2, type = "mle-mcmc",
                        mcmc_samples = sdmTMBextra::predict_mle_mcmc(m2,
                                                                     mcmc_iter = 201,
                                                                     mcmc_warmup = 200))
#> 
#> SAMPLING FOR MODEL 'tmb_generic' NOW (CHAIN 1).
#> Chain 1: 
#> Chain 1: Gradient evaluation took 0.028375 seconds
#> Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 283.75 seconds.
#> Chain 1: Adjust your expectations accordingly!
#> Chain 1: 
#> Chain 1: 
#> Chain 1: Iteration:   1 / 201 [  0%]  (Warmup)
#> Chain 1: Iteration:  20 / 201 [  9%]  (Warmup)
#> Chain 1: Iteration:  40 / 201 [ 19%]  (Warmup)
#> Chain 1: Iteration:  60 / 201 [ 29%]  (Warmup)
#> Chain 1: Iteration:  80 / 201 [ 39%]  (Warmup)
#> Chain 1: Iteration: 100 / 201 [ 49%]  (Warmup)
#> Chain 1: Iteration: 120 / 201 [ 59%]  (Warmup)
#> Chain 1: Iteration: 140 / 201 [ 69%]  (Warmup)
#> Chain 1: Iteration: 160 / 201 [ 79%]  (Warmup)
#> Chain 1: Iteration: 180 / 201 [ 89%]  (Warmup)
#> Chain 1: Iteration: 200 / 201 [ 99%]  (Warmup)
#> Chain 1: Iteration: 201 / 201 [100%]  (Sampling)
#> Chain 1: 
#> Chain 1:  Elapsed Time: 286.605 seconds (Warm-up)
#> Chain 1:                0.338214 seconds (Sampling)
#> Chain 1:                286.944 seconds (Total)
#> Chain 1:

# Same MLE-MCMC residuals for the df = 9 fit
mcmc_res9 <- residuals(m3, type = "mle-mcmc",
                       mcmc_samples = sdmTMBextra::predict_mle_mcmc(m3,
                                                                    mcmc_iter = 201,
                                                                    mcmc_warmup = 200))
#> 
#> SAMPLING FOR MODEL 'tmb_generic' NOW (CHAIN 1).
#> Chain 1: 
#> Chain 1: Gradient evaluation took 0.017833 seconds
#> Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 178.33 seconds.
#> Chain 1: Adjust your expectations accordingly!
#> Chain 1: 
#> Chain 1: 
#> Chain 1: Iteration:   1 / 201 [  0%]  (Warmup)
#> Chain 1: Iteration:  20 / 201 [  9%]  (Warmup)
#> Chain 1: Iteration:  40 / 201 [ 19%]  (Warmup)
#> Chain 1: Iteration:  60 / 201 [ 29%]  (Warmup)
#> Chain 1: Iteration:  80 / 201 [ 39%]  (Warmup)
#> Chain 1: Iteration: 100 / 201 [ 49%]  (Warmup)
#> Chain 1: Iteration: 120 / 201 [ 59%]  (Warmup)
#> Chain 1: Iteration: 140 / 201 [ 69%]  (Warmup)
#> Chain 1: Iteration: 160 / 201 [ 79%]  (Warmup)
#> Chain 1: Iteration: 180 / 201 [ 89%]  (Warmup)
#> Chain 1: Iteration: 200 / 201 [ 99%]  (Warmup)
#> Chain 1: Iteration: 201 / 201 [100%]  (Sampling)
#> Chain 1: 
#> Chain 1:  Elapsed Time: 274.712 seconds (Warm-up)
#> Chain 1:                0.298227 seconds (Sampling)
#> Chain 1:                275.01 seconds (Total)
#> Chain 1:

# Same MLE-MCMC residuals for the df = 4 fit
mcmc_res4 <- residuals(m4, type = "mle-mcmc",
                       mcmc_samples = sdmTMBextra::predict_mle_mcmc(m4,
                                                                    mcmc_iter = 201,
                                                                    mcmc_warmup = 200))
#> 
#> SAMPLING FOR MODEL 'tmb_generic' NOW (CHAIN 1).
#> Chain 1: 
#> Chain 1: Gradient evaluation took 0.016611 seconds
#> Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 166.11 seconds.
#> Chain 1: Adjust your expectations accordingly!
#> Chain 1: 
#> Chain 1: 
#> Chain 1: Iteration:   1 / 201 [  0%]  (Warmup)
#> Chain 1: Iteration:  20 / 201 [  9%]  (Warmup)
#> Chain 1: Iteration:  40 / 201 [ 19%]  (Warmup)
#> Chain 1: Iteration:  60 / 201 [ 29%]  (Warmup)
#> Chain 1: Iteration:  80 / 201 [ 39%]  (Warmup)
#> Chain 1: Iteration: 100 / 201 [ 49%]  (Warmup)
#> Chain 1: Iteration: 120 / 201 [ 59%]  (Warmup)
#> Chain 1: Iteration: 140 / 201 [ 69%]  (Warmup)
#> Chain 1: Iteration: 160 / 201 [ 79%]  (Warmup)
#> Chain 1: Iteration: 180 / 201 [ 89%]  (Warmup)
#> Chain 1: Iteration: 200 / 201 [ 99%]  (Warmup)
#> Chain 1: Iteration: 201 / 201 [100%]  (Sampling)
#> Chain 1: 
#> Chain 1:  Elapsed Time: 312.43 seconds (Warm-up)
#> Chain 1:                0.372699 seconds (Sampling)
#> Chain 1:                312.803 seconds (Total)
#> Chain 1:

# Collect residuals from all four df settings into long format for a
# faceted QQ comparison (residual vectors are row-aligned with d)
dres <- d |> mutate("df=5" = mcmc_res,
                    "df=20" = mcmc_res20,
                    "df=9" = mcmc_res9,
                    "df=4" = mcmc_res4) |> 
  select(`df=5`, `df=20`, `df=9`, `df=4`) |> 
  pivot_longer(everything())
#> mutate: new variable 'df=5' (double) with 95,071 unique values and 0% NA
#>         new variable 'df=20' (double) with 95,071 unique values and 0% NA
#>         new variable 'df=9' (double) with 95,071 unique values and 0% NA
#>         new variable 'df=4' (double) with 95,071 unique values and 0% NA
#> select: dropped 15 variables (year, area, temp, yday, month, …)
#> pivot_longer: reorganized (df=5, df=20, df=9, df=4) into (name, value) [was 98616x4, now 394464x2]

# QQ plots by df, ordered from heaviest Gaussian approximation to fattest tails
ggplot(dres, aes(sample = value)) +
  stat_qq(size = 0.75, shape = 21, fill = NA) +
  facet_wrap(~factor(name, levels = c("df=20", "df=9", "df=5", "df=4"))) +
  stat_qq_line() +
  labs(y = "Sample Quantiles", x = "Theoretical Quantiles") + 
  theme(aspect.ratio = 1)

Code
#AIC(m, m2)

Check fit

Code
# Run sdmTMB's standard convergence / gradient / standard-error checks
sanity(m)
#> ✔ Non-linear minimizer suggests successful convergence
#> ✔ Hessian matrix is positive definite
#> ✔ No extreme or very small eigenvalues detected
#> ✔ No gradients with respect to fixed effects are >= 0.001
#> ✔ No fixed-effect standard errors are NA
#> ✔ No standard errors look unreasonably large
#> ✔ No sigma parameters are < 0.01
#> ✔ No sigma parameters are > 100
#> Warning in min(x, na.rm = na.rm): no non-missing arguments to min; returning
#> Inf
#> Warning in max(x, na.rm = na.rm): no non-missing arguments to max; returning
#> -Inf
#> Warning in min(x, na.rm = na.rm): no non-missing arguments to min; returning
#> Inf
#> Warning in max(x, na.rm = na.rm): no non-missing arguments to max; returning
#> -Inf

# QQ plot of the main model's MLE-MCMC residuals.
# NOTE(review): mcmc_res is not a column of d — aes() finds it in the
# calling environment; works because it is row-aligned with d, but fragile
ggplot(d, aes(sample = mcmc_res)) +
  stat_qq() +
  stat_qq_line() +
  labs(y = "Sample Quantiles", x = "Theoretical Quantiles") + 
  theme(aspect.ratio = 1)

Code

# Save the QQ plot (ggsave writes the most recent ggplot) to supplementary figures
ggsave(paste0(home, "/figures/supp/qq_temp.pdf"), width = 11, height = 11, units = "cm")

Predict

Code
# Make a new data frame and predict!
nd <- data.frame(expand.grid(yday = seq(min(d$yday), max(d$yday), by = 1),
                             area = unique(d$area),
                             year = unique(d$year))) |>
  mutate(source = "logger") |> 
  mutate(id = paste(year, area, sep = "_"),
         source_f = as.factor(source),
         year_f = as.factor(year)) 
#> mutate: new variable 'source' (character) with one unique value and 0% NA
#> mutate: new variable 'id' (character) with 996 unique values and 0% NA
#>         new variable 'source_f' (factor) with one unique value and 0% NA
#>         new variable 'year_f' (factor) with 83 unique values and 0% NA

# Predict
nd$pred <- predict(m, newdata = nd)$est

In order to have a lower temperature in the pre-nuclear period (for which there are no data to inform the model), we use nearby areas as stand-ins: FM informs BT, and SI_EK informs SI_HA, prior to the onset of nuclear-plant warming.

Code
# Drop predictions for BT and SI_HA in pre-warming years (no usable data).
# NOTE(review): boundaries here use year < 1980 / < 1972, while the data
# filtering above used <= 1980 / <= 1972 — confirm which is intended
nd <- nd |> 
  mutate(keep = "Y",
         keep = ifelse(area == "BT" & year < 1980, "N", keep),
         keep = ifelse(area == "SI_HA" & year < 1972, "N", keep)) |>
  filter(keep == "Y")
#> mutate: new variable 'keep' (character) with 2 unique values and 0% NA
#> filter: removed 26,352 rows (7%), 338,184 rows remaining
  
# Stand-in series for the dropped years: FM for BT, SI_EK for SI_HA
nd_sub <- nd |> 
  mutate(keep = "N",
         keep = ifelse(area == "FM" & year < 1980, "Y", keep), # use FM instead of BT
         keep = ifelse(area == "SI_EK" & year < 1972, "Y", keep)) |> # use SI_EK instead of SI_HA
  filter(keep == "Y")
#> mutate: changed 311,832 values (92%) of 'keep' (0 new NA)
#> filter: removed 311,832 rows (92%), 26,352 rows remaining

# Now change the labels to BT and SI_EK...
# NOTE(review): this two-way ifelse relies on nd_sub containing only FM and
# SI_EK rows; any other area would silently be relabeled "SI_HA"
nd_sub <- nd_sub |> 
  mutate(area = ifelse(area == "FM", "BT", "SI_HA"))
#> mutate: converted 'area' from factor to character (0 new NA)

# Bind rows and plot only the temperature series we will use for growth modelling
nd <- bind_rows(nd, nd_sub) |> select(-keep)
#> select: dropped one variable (keep)

Keep track of the years for which we have cohorts for matching with cohort data

Code
# Back-calculated perch growth data (public repo); used here only to find
# the first cohort year with usable growth data per area
gdat <- readr::read_csv("https://raw.githubusercontent.com/maxlindmark/perch-growth/master/data/for-analysis/dat.csv")
#> Rows: 364546 Columns: 11
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> chr (5): age_ring, area, gear, ID, sex
#> dbl (6): length_mm, age_bc, age_catch, catch_year, cohort, final_length
#> 
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Group key: area × cohort × back-calculated age
gdat$area_cohort_age <- as.factor(paste(gdat$area, gdat$cohort, gdat$age_bc))

# Keep only combinations with more than 10 fish
gdat <- gdat |>
  group_by(area_cohort_age) |> 
  filter(n() > 10)
#> group_by: one grouping variable (area_cohort_age)
#> filter (grouped): removed 5,488 rows (2%), 359,058 rows remaining

# First cohort year per area, among fish caught at age > 3
gdat <- gdat |> 
  filter(age_catch > 3) |> 
  group_by(area) |>
  summarise(min = min(cohort)) |> 
  arrange(min)
#> filter (grouped): removed 121,647 rows (34%), 237,411 rows remaining
#> group_by: one grouping variable (area)
#> summarise: now 12 rows and 2 columns, ungrouped

# Flag predicted years that overlap the growth data (year >= first cohort)
nd <- left_join(nd, gdat, by = "area") |>
  mutate(growth_dat = ifelse(year >= min, "Y", "N"))
#> left_join: added one column (min)
#>            > rows only in x         0
#>            > rows only in y  (      0)
#>            > matched rows     364,536
#>            >                 =========
#>            > rows total       364,536
#> mutate: new variable 'growth_dat' (character) with 2 unique values and 0% NA

Plot predictions

Code
# Heatmap of predicted temperature by day-of-year (x) and year (y) per area,
# restricted to years that overlap the growth data
nd |> 
  filter(growth_dat == "Y") |> 
  ggplot(aes(yday, y = year, fill = pred, color = pred)) +
    geom_raster() +
    facet_wrap(~area, ncol = 3) +
    scale_fill_viridis(option = "magma") +
    scale_color_viridis(option = "magma") +
    labs(x = "Yearday", y = "Year", color = "Predicted SST (°C)", fill = "Predicted SST (°C)")
#> filter: removed 157,380 rows (43%), 207,156 rows remaining

Growing season? This might be different for different areas…

Code
# Find day of the year where temperature exceeds 10C by area across years
# Growing season per area: first (gs_min) and last (gs_max) yday whose
# across-year mean predicted temperature exceeds 10°C
gs_area <- nd |> 
  group_by(area, yday) |> 
  summarise(mean_pred = mean(pred)) |>
  ungroup() |> 
  filter(mean_pred > 10) |> 
  group_by(area) |> 
  summarise(gs_min = min(yday),
            gs_max = max(yday))
#> group_by: 2 grouping variables (area, yday)
#> summarise: now 4,392 rows and 3 columns, one group variable remaining (area)
#> ungroup: no grouping variables
#> filter: removed 2,501 rows (57%), 1,891 rows remaining
#> group_by: one grouping variable (area)
#> summarise: now 12 rows and 3 columns, ungrouped

# Attach the growing-season window to every prediction row
nd <- left_join(nd, gs_area, by = "area")
#> left_join: added 2 columns (gs_min, gs_max)
#>            > rows only in x         0
#>            > rows only in y  (      0)
#>            > matched rows     364,536
#>            >                 =========
#>            > rows total       364,536

# Constant y-value so gs_min/gs_max can be drawn as points on the 10°C line
gs_area$mean_pred <- 10

# Plot!
# Mean seasonal curve per area with growing-season endpoints (red points)
nd |> 
  filter(growth_dat == "Y") |> 
  group_by(area, yday) |> 
  summarise(mean_pred = mean(pred)) |> 
  ggplot(aes(yday, mean_pred)) +
  geom_line() +
  labs(x = "Yearday", y = "Year", color = "Predicted SST (°C)", fill = "Predicted SST (°C)") + 
  facet_wrap(~area) +
  geom_point(data = gs_area, aes(gs_min, mean_pred), inherit.aes = FALSE, color = "tomato2") + 
  geom_point(data = gs_area, aes(gs_max, mean_pred), inherit.aes = FALSE, color = "tomato2") +
  geom_hline(yintercept = 10, linetype = 2)
#> filter: removed 157,380 rows (43%), 207,156 rows remaining
#> group_by: 2 grouping variables (area, yday)
#> summarise: now 4,392 rows and 3 columns, one group variable remaining (area)

Detailed exploration of predictions

Code
# Loop trough all areas, plot temperature as a function of yday, color by data source, facet by year

for(i in unique(nd$area)) {
  
  # Predictions for this area only
  plotdat <- nd |> filter(area = i)
  
  # Predicted curve (line) overlaid on observed data (points, colored by
  # source), one panel per year; print() needed inside a for loop
  print(
    ggplot(plotdat, aes(yday, pred, linetype = "Model prediction (logger)")) + 
      scale_color_brewer(palette = "Accent") + 
      facet_wrap(~year) + 
      # NOTE(review): year > min(plotdat$year) excludes observed data from
      # the first predicted year — confirm this is intended
      geom_point(data = filter(d, area == i & year > min(plotdat$year)), aes(yday, temp, color = source),
                 size = 0.75) + 
      geom_line() + 
      labs(title = paste("Area = ", i), color = "", linetype = "") + 
      guides(color = guide_legend(title.position="top", title.hjust = 0.5)) + 
      theme_sleek(base_size = 8) +
      theme(legend.position = c(0.7, 0.03), 
            legend.direction = "horizontal",
            legend.spacing.y = unit(-0.3, "cm")) + 
      labs(x = "Day of the year", y = "Predicted SST (°C)")
  )
  
  # One multi-panel PDF per area
  ggsave(paste0(home, "/figures/supp/temp_pred_yday_area_", i, ".pdf" ), width = 17, height = 17, units = "cm")
  
}
#> filter: removed 334,158 rows (92%), 30,378 rows remaining
#> filter: removed 95,167 rows (97%), 3,449 rows remaining
#> filter: removed 334,158 rows (92%), 30,378 rows remaining
#> filter: removed 89,164 rows (90%), 9,452 rows remaining

#> filter: removed 334,158 rows (92%), 30,378 rows remaining
#> filter: removed 89,155 rows (90%), 9,461 rows remaining

#> filter: removed 334,158 rows (92%), 30,378 rows remaining
#> filter: removed 86,660 rows (88%), 11,956 rows remaining

#> filter: removed 334,158 rows (92%), 30,378 rows remaining
#> filter: removed 90,466 rows (92%), 8,150 rows remaining

#> filter: removed 334,158 rows (92%), 30,378 rows remaining
#> filter: removed 88,494 rows (90%), 10,122 rows remaining

#> filter: removed 334,158 rows (92%), 30,378 rows remaining
#> filter: removed 87,743 rows (89%), 10,873 rows remaining

#> filter: removed 334,158 rows (92%), 30,378 rows remaining
#> filter: removed 93,808 rows (95%), 4,808 rows remaining

#> filter: removed 334,158 rows (92%), 30,378 rows remaining
#> filter: removed 89,755 rows (91%), 8,861 rows remaining

#> filter: removed 334,158 rows (92%), 30,378 rows remaining
#> filter: removed 82,797 rows (84%), 15,819 rows remaining

#> filter: removed 334,158 rows (92%), 30,378 rows remaining
#> filter: removed 95,212 rows (97%), 3,404 rows remaining

#> filter: removed 334,158 rows (92%), 30,378 rows remaining
#> filter: removed 96,499 rows (98%), 2,117 rows remaining

Plot summarized data and predictions

Code
# Observed annual mean temperature per area and data source
dsum <- d |> 
  group_by(year, area, source) |> 
  summarise(temp = mean(temp)) |> 
  mutate(type = "data")
#> group_by: 3 grouping variables (year, area, source)
#> summarise: now 1,676 rows and 4 columns, 2 group variables remaining (year, area)
#> mutate (grouped): new variable 'type' (character) with one unique value and 0% NA

# Model-predicted annual mean (whole year) per area, growth years only
preds <- nd |> 
  filter(growth_dat == "Y" & source == "logger") |> 
  group_by(area, year) |> 
  summarise(temp = mean(pred)) |> 
  mutate(type = "model")
#> filter: removed 157,380 rows (43%), 207,156 rows remaining
#> group_by: 2 grouping variables (area, year)
#> summarise: now 566 rows and 3 columns, one group variable remaining (area)
#> mutate (grouped): new variable 'type' (character) with one unique value and 0% NA

# Now do by growing season instead
# Same summary, but averaging only days within the per-area gs_min..gs_max window
preds_gs <- nd |> 
  filter(growth_dat == "Y" & source == "logger") |> 
  filter(yday >= gs_min & yday <= gs_max) |> 
  group_by(area, year) |> 
  summarise(temp_gs = mean(pred))
#> filter: removed 157,380 rows (43%), 207,156 rows remaining
#> filter: removed 117,410 rows (57%), 89,746 rows remaining
#> group_by: 2 grouping variables (area, year)
#> summarise: now 566 rows and 3 columns, one group variable remaining (area)

# Combine whole-year and growing-season means into one table
preds <- left_join(preds, preds_gs, by = c("area", "year"))
#> left_join: added one column (temp_gs)
#>            > rows only in x     0
#>            > rows only in y  (  0)
#>            > matched rows     566
#>            >                 =====
#>            > rows total       566

# Whole-year vs growing-season means, colored by area
ggplot(preds, aes(temp, temp_gs, color = area)) + 
  geom_point()

Code

# Model-predicted annual means (grey lines) over observed annual means
# (points, colored by data source), faceted by area
ggplot(preds, aes(year, temp)) + 
  geom_point(data = dsum, aes(year, temp, color = source), size = 0.75, alpha = 0.75) + 
  scale_color_brewer(palette = "Accent") +
  geom_line(linewidth = 0.5, color = "grey20") + 
  facet_wrap(~area)

Now see if there is a systematic pattern in the difference between predicted and observed logger data, which could indicate that the source effect isn’t global but area-specific.

Code
# Observed logger temperatures, averaged to one value per area_year_yday id
dlog <- d |> 
  filter(source == "logger") |> 
  mutate(type = "data",
         id = paste(area, year, yday, sep = "_")) |> 
  select(id, temp) |> 
  group_by(id) |> 
  summarise(obs = mean(temp)) # sometimes we have more than 1 observation per id
#> filter: removed 34,632 rows (35%), 63,984 rows remaining
#> mutate: changed 63,984 values (100%) of 'id' (63984 fewer NA)
#>         new variable 'type' (character) with one unique value and 0% NA
#> select: dropped 14 variables (year, area, yday, month, date, …)
#> group_by: one grouping variable (id)
#> summarise: now 60,967 rows and 2 columns, ungrouped

# dlog |> 
#   group_by(id) |> 
#   summarise(n = n()) |> 
#   distinct(n)

# Model predictions matched (by area_year_yday id) to observed logger values
preds_log <- nd |> 
  filter(growth_dat == "Y" & source == "logger") |> 
  mutate(type = "model",
         id = paste(area, year, yday, sep = "_")) |> 
  filter(id %in% unique(dlog$id)) |> 
  ungroup() |> 
  left_join(dlog, by = "id")
#> filter: removed 157,380 rows (43%), 207,156 rows remaining
#> mutate: changed 207,156 values (100%) of 'id' (0 new NA)
#>         new variable 'type' (character) with one unique value and 0% NA
#> filter: removed 146,189 rows (71%), 60,967 rows remaining
#> ungroup: no grouping variables
#> left_join: added one column (obs)
#>            > rows only in x        0
#>            > rows only in y  (     0)
#>            > matched rows     60,967
#>            >                 ========
#>            > rows total       60,967

# Distribution of manual residuals (predicted - observed logger) by area;
# a systematic offset in any area would suggest the source effect should
# be area-specific rather than global
preds_log |> 
  mutate(resid = pred - obs) |> 
  ggplot(aes(as.factor(area), resid, group = as.factor(area))) +
  #geom_jitter(alpha = 0.05, color = "grey20", height = 0, width = 0.2) + 
  geom_violin(fill = "grey70", color = NA) +
  geom_boxplot(width = 0.2, outlier.colour = NA, outlier.color = NA, outlier.fill = NA) +
  guides(color = "none") +
  geom_hline(yintercept = 0, linetype = 2, color = "tomato3", linewidth = 0.75) + 
  labs(x = "Area", y = "Manual residuals")
#> mutate: new variable 'resid' (double) with 60,967 unique values and 0% NA

Make final plot

Code
# Areas sorted from warmest to coldest mean predicted temperature.
# NOTE(review): `order` shadows base::order() for the rest of the session
order <- preds |>
  group_by(area) |> 
  summarise(mean_temp = mean(temp)) |> 
  arrange(desc(mean_temp))
#> group_by: one grouping variable (area)
#> summarise: now 12 rows and 2 columns, ungrouped

order
#> # A tibble: 12 × 2
#>    area  mean_temp
#>    <chr>     <dbl>
#>  1 SI_HA     14.3 
#>  2 BT        13.2 
#>  3 TH        10.4 
#>  4 VN        10.0 
#>  5 SI_EK      9.32
#>  6 FM         9.06
#>  7 JM         8.66
#>  8 BS         8.64
#>  9 MU         8.64
#> 10 FB         8.17
#> 11 HO         6.97
#> 12 RA         6.81
# Save plot order..

# Growth-optimum temperature used as a reference line in the final plot
topt <- 10.6 # Overall t_opt from 02-fit-vbge.qmd! Update if needed

# Add latitude
# Area coordinates, used to order plot facets from north to south
area <- c("BS", "BT", "FB", "FM", "HO", "JM", "MU", "RA", "SI_EK", "SI_HA", "TH", "VN")
nareas <- length(area)
lat <- c(60, 60.4, 60.3, 60.5, 63.7, 58, 59, 65.9, 57.3, 57.4, 56.1, 57.5)
lon <- c(21.5, 18.1, 19.5, 18, 20.9, 16.8, 18.1, 22.3, 16.6, 16.7, 15.9, 16.9)
# Build the data frame directly: the original data.frame(cbind(...)) coerced
# lat/lon to character (cbind makes a character matrix with mixed inputs) and
# then converted them back with the superseded mutate_at(); constructing it
# this way keeps lat/lon numeric from the start
area_attr <- data.frame(area = area, lat = lat, lon = lon) |>
  arrange(desc(lat))

# Annual mean temperature time series per area, facets ordered north to
# south (area_attr), with the growth-optimum temperature as a dashed line
ggplot(preds, aes(year, temp, color = temp)) + 
  facet_wrap(~factor(area, levels = area_attr$area), ncol = 3) + 
  geom_hline(yintercept = topt, linewidth = 0.3, linetype = 2, color = "grey") +
  geom_line() +
  labs(x = "Year", y = "Model-predicted annual average temperature") + 
  scale_color_viridis(option = "magma", name = "Area") +
  guides(color = "none") 

Code

# Year coverage of the predicted series per area
preds |> 
  group_by(area) |> 
  summarise(min = min(year),
            max = max(year)) |> 
  arrange(min)
#> group_by: one grouping variable (area)
#> summarise: now 12 rows and 3 columns, ungrouped
#> # A tibble: 12 × 3
#>    area    min   max
#>    <chr> <dbl> <dbl>
#>  1 JM     1953  2022
#>  2 FM     1963  2022
#>  3 SI_HA  1966  2022
#>  4 SI_EK  1968  2022
#>  5 FB     1969  2022
#>  6 BT     1971  2022
#>  7 MU     1981  2022
#>  8 HO     1982  2022
#>  9 BS     1985  2022
#> 10 RA     1985  2022
#> 11 VN     1987  2022
#> 12 TH     2000  2022

# Save the faceted annual-temperature figure (ggsave writes the last ggplot)
ggsave(paste0(home, "/figures/annual_average_temperature.pdf"), width = 17, height = 17, units = "cm")
Code
# Save prediction df
write_csv(preds, paste0(home, "/output/gam_predicted_temps.csv"))

If we want to get uncertainty, we can use nsim instead; this simulates from the linear predictor using the inverse precision matrix, which is a fast way to get a distribution of samples from which we can take e.g. quantiles and means. However, it’s still slow, so the code below isn’t executed yet.

Code
# Build the simulation prediction grid, mirroring the point-prediction grid
nd_sim <- data.frame(expand.grid(yday = seq(min(d$yday), max(d$yday), by = 1),
                                 area = unique(d$area),
                                 year = unique(d$year))) |>
  mutate(source = "logger") |>
  mutate(id = paste(year, area, sep = "_"),
         source_f = as.factor(source),
         year_f = as.factor(year))

# Trim! Join the first cohort year per area (gdat) AND the growing-season
# window per area (gs_area); the original never joined gs_min/gs_max onto
# nd_sim, so the filter below would have failed with "object not found"
nd_sim <- left_join(nd_sim, gdat, by = "area")
nd_sim <- left_join(nd_sim, gs_area |> select(area, gs_min, gs_max), by = "area")

nd_sim <- nd_sim |>
  # >= keeps the first cohort year, consistent with the point predictions
  # above (which use year >= min); the original used a strict >
  mutate(growth_dat = ifelse(year >= min, "Y", "N")) |>
  filter(growth_dat == "Y") |>
  # Keep the full growing season; the original filtered
  # yday %in% c(gs_min:gs_min), i.e. a single day
  filter(yday >= gs_min & yday <= gs_max) |>
  mutate(area = as.factor(area))

# Predict! nsim draws from the joint precision matrix (one column per draw)
nsim <- 500
m_pred_sims <- predict(m, newdata = nd_sim, nsim = nsim)

# Join sims with prediction data and pivot the draw columns to long format
nd_sim_long <- cbind(nd_sim, as.data.frame(m_pred_sims)) |>
    pivot_longer(c((ncol(nd_sim) + 1):(nsim + ncol(nd_sim))))

# Summarize sims over growing season: quantiles across draws and days,
# giving lower (10%), median, and upper (90%) annual estimates per area
sum_pred_gs <- nd_sim_long |>
    ungroup() |>
    group_by(year, area) |>
    summarise(lwr = quantile(value, prob = 0.1),
              est = quantile(value, prob = 0.5),
              upr = quantile(value, prob = 0.9)) |>
    ungroup()

# In order to have a lower temperature in before-nuclear times (without any data to inform that), we can use the nearby areas..
# NOTE(review): this assignment overwrites sum_pred_gs (the simulation
# summary computed just above) with the point-prediction table `preds` —
# confirm this is intended; the chunk is marked as not yet executed
sum_pred_gs <- preds |>
  mutate(keep = "Y",
         keep = ifelse(area == "BT" & year < 1980, "N", keep),
         keep = ifelse(area == "SI_HA" & year < 1972, "N", keep)) |>
  filter(keep == "Y")

# Stand-in rows: FM for BT, SI_EK for SI_HA, pre-warming years only
sum_pred_gs_sub <- preds |>
  mutate(keep = "N",
         keep = ifelse(area == "FM" & year < 1980, "Y", keep), # use FM instead of BT
         keep = ifelse(area == "SI_EK" & year < 1972, "Y", keep)) |> # use SI_EK instead of SI_HA
  filter(keep == "Y")

# Now change the labels to BT and SI_EK...
# NOTE(review): relies on sum_pred_gs_sub containing only FM and SI_EK rows
sum_pred_gs_sub <- sum_pred_gs_sub |>
  mutate(area = ifelse(area == "FM", "BT", "SI_HA"))

# Bind rows and plot only the temperature series we will use for growth modelling
sum_pred_gs <- bind_rows(sum_pred_gs, sum_pred_gs_sub) |> select(-keep, -type)

# Plot order: warmest to coldest mean annual temperature
order <- sum_pred_gs |>
  group_by(area) |>
  summarise(mean_temp = mean(temp)) |>
  arrange(desc(mean_temp))